II Visualization of distributional data (“displot”)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%%javascript
// Replace the output area's `_should_scroll` check with a function that
// always answers false, regardless of how many lines the output has.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false; // disable auto scrolling
}
# Load seaborn's "penguins" example dataset and preview its first five rows.
penguins = sns.load_dataset("penguins")
penguins.head(n=5)
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
3 Adelie Torgersen NaN NaN NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 Female

Histogram with continuous data

# Histogram of flipper length with seaborn's default binning.
sns.displot(data=penguins, x="flipper_length_mm")

# Same histogram with an explicit bin width.
sns.displot(data=penguins, x="flipper_length_mm", binwidth=7.1)

# Same histogram with an explicit number of bins.
sns.displot(data=penguins, x="flipper_length_mm", bins=20)

Bin widths that are too small (or too large) can break histograms

# Very small bin width.
sns.displot(data=penguins, x="flipper_length_mm", binwidth=0.3)

# Bin width too big: the two hills in the data are not visible.
sns.displot(data=penguins, x="flipper_length_mm", binwidth=30)

# Fixing the number of bins instead of their width.
sns.displot(data=penguins, x="flipper_length_mm", bins=15)

Histogram with discrete data (“party size”)

# Load seaborn's "tips" example dataset and preview its first five rows.
tips = sns.load_dataset("tips")
tips.head(n=5)
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
# `size` is passed with discrete=True so it is treated as a discrete variable.
sns.displot(data=tips, x="size", discrete=True)

Histogram with discrete data (weekdays)

# No need to specify discrete=True here because seaborn figures it out on
# its own.
sns.displot(data=tips, x="day")

Distribution of data differentiated based on categorical variable

# Colour the histogram by species.
sns.displot(data=penguins, x="flipper_length_mm", hue="species")

# Colour by species and additionally split into one column per island.
sns.displot(
    data=penguins,
    x="flipper_length_mm",
    hue="species",
    col="island",
)

Histogram stacking versus histogram overlap

With stacking:

# Per-species histograms drawn with multiple="stack".
sns.displot(
    data=penguins,
    x="flipper_length_mm",
    hue="species",
    multiple="stack",
)

Histogram stacking versus histogram overlap versus dodge

With dodging:

# Per-species histograms drawn with multiple="dodge".
sns.displot(
    data=penguins,
    x="flipper_length_mm",
    hue="species",
    multiple="dodge",
)

Different subplots for different values of a categorical variable

# One subplot column per value of `sex`.
sns.displot(data=penguins, x="flipper_length_mm", col="sex")

# Columns by sex, rows by island, and dodged bars coloured by species.
sns.displot(
    data=penguins,
    x="flipper_length_mm",
    hue="species",
    row="island",
    col="sex",
    multiple="dodge",
)

Kernel Density Estimation (KDE) plots to smooth histograms

# KDE plot of flipper length with the default bandwidth.
sns.displot(data=penguins, x="flipper_length_mm", kind="kde")

# Setting the bandwidth. Overfitting: the bandwidth is too small, so the
# curve is jittery and the jitter comes from noise.
sns.displot(data=penguins, x="flipper_length_mm", kind="kde", bw_method=0.05)

# Setting the bandwidth to an intermediate value.
sns.displot(data=penguins, x="flipper_length_mm", kind="kde", bw_method=0.3)

# Setting the bandwidth. Underfitting: the bandwidth is too big, so the
# curve is too smoothed out and not informative.
sns.displot(data=penguins, x="flipper_length_mm", kind="kde", bw_method=2)

# One KDE curve per species.
sns.displot(data=penguins, x="flipper_length_mm", hue="species", kind="kde")

# One KDE curve per species, one subplot column per island.
sns.displot(
    data=penguins,
    x="flipper_length_mm",
    hue="species",
    col="island",
    kind="kde",
)

# Per-species KDE curves with the area under each curve filled.
sns.displot(
    data=penguins,
    x="flipper_length_mm",
    hue="species",
    kind="kde",
    fill=True,
)

# Filled per-species KDE curves drawn with multiple="stack".
sns.displot(
    data=penguins,
    x="flipper_length_mm",
    hue="species",
    kind="kde",
    fill=True,
    multiple="stack",
)

2-dimensional distributional plots

Histograms in 2d (also known as heatmaps)

# 2d histogram of bill length against bill depth.
sns.displot(data=penguins, x="bill_length_mm", y="bill_depth_mm")

# Same 2d histogram with a colorbar added.
sns.displot(data=penguins, x="bill_length_mm", y="bill_depth_mm", cbar=True)

# Hexagonal-bin joint plot with marginal distributions.
# `data` is passed by keyword, like the other jointplot calls in this file,
# so the call does not rely on seaborn accepting the DataFrame positionally
# (older releases declare jointplot's parameters as keyword-only).
sns.jointplot(
    data=penguins,
    x="bill_length_mm",
    y="bill_depth_mm",
    kind="hex",
)

KDE plots in 2d

# 2d KDE plot of bill length against bill depth.
sns.displot(
    data=penguins,
    x="bill_length_mm",
    y="bill_depth_mm",
    kind="kde",
)

Controlling the number of isolines and the threshold for the smallest isoline

# 2d KDE plot with `levels` and `thresh` set explicitly.
sns.displot(
    data=penguins,
    x="bill_length_mm",
    y="bill_depth_mm",
    kind="kde",
    thresh=0.02,
    levels=12,
)

2d histograms differentiated with colors for different species

# 2d histograms coloured by species, one subplot column per island.
sns.displot(
    data=penguins,
    x="bill_length_mm",
    y="bill_depth_mm",
    hue="species",
    col="island",
)

2d KDE plots differentiated with colors for different species

# 2d KDE plots coloured by species, one subplot column per island.
sns.displot(
    data=penguins,
    x="bill_length_mm",
    y="bill_depth_mm",
    hue="species",
    col="island",
    kind="kde",
)

Changing binwidth (in two directions)

# 2d histogram with a separate bin width for each of x and y.
sns.displot(
    data=penguins,
    x="bill_length_mm",
    y="bill_depth_mm",
    binwidth=(3, 1),
)

Visualizing 2d distributions and 1d marginals with sns.jointplot()

# Default joint plot, with the marker set to "X".
sns.jointplot(data=penguins, x="bill_length_mm", y="bill_depth_mm", marker="X")

# Joint plot with kind="hist".
sns.jointplot(data=penguins, x="bill_length_mm", y="bill_depth_mm", kind="hist")

# Joint plot with kind="kde", coloured by species.
sns.jointplot(
    data=penguins,
    x="bill_length_mm",
    y="bill_depth_mm",
    hue="species",
    kind="kde",
)

Visualizing 2d distributions and 1d marginals

# KDE joint plot coloured by species.
sns.jointplot(
    data=penguins,
    x="bill_length_mm",
    y="bill_depth_mm",
    hue="species",
    kind="kde"
)

# The same plot once more. `data` is now passed by keyword, matching the call
# above, so this no longer relies on seaborn accepting the DataFrame
# positionally (older releases declare jointplot's parameters as
# keyword-only).
sns.jointplot(data=penguins,
              x="bill_length_mm",
              y="bill_depth_mm",
              hue="species",
              kind="kde")

Rug plots: visualizing the 2d distribution AND the 1d locations of single points

Multiple layers: for instance, both scatter plot and KDE plots, both rugs and marginal plots

# Joint plot with default settings; keep the returned grid to add a layer.
g = sns.jointplot(data=penguins, x="bill_length_mm", y="bill_depth_mm")

# KDE plot in red, drawn on the same joint axes.
g.plot_joint(sns.kdeplot, color="red")

# Base layer: scatter plot in blue.
g = sns.jointplot(data=penguins, x="bill_length_mm", y="bill_depth_mm")

# Second layer on the same plot: KDE plot in red.
g.plot_joint(sns.kdeplot, color="red")

# Marginal axes: rug plot in green.
g.plot_marginals(sns.rugplot, color="green", height=0.15)